In [1]:
# Suppress library warnings for cleaner notebook output.
# NOTE(review): this silences *all* warnings (including pandas/sklearn
# deprecations) for the whole kernel session — consider narrowing by category.
import warnings
warnings.filterwarnings('ignore') 


import pandas as pd
# Load the area-weighted India rainfall dataset. The head() output below shows
# the schema: REGION, YEAR, monthly totals (JAN..DEC), ANNUAL, and four
# seasonal aggregates (Jan-Feb, Mar-May, Jun-Sep, Oct-Dec), all in mm.
rainfall_data = pd.read_csv("rainfall_area-wt_India_1901-2015.csv")

# Quick sanity check of the first rows.
print(rainfall_data.head())
  REGION  YEAR   JAN   FEB   MAR   APR   MAY    JUN    JUL    AUG    SEP  \
0  INDIA  1901  34.7  37.7  18.0  39.3  50.8  113.4  242.2  272.9  124.4   
1  INDIA  1902   7.4   4.3  19.0  43.5  48.3  108.8  284.0  199.7  201.5   
2  INDIA  1903  17.0   8.3  31.3  17.1  59.5  118.3  297.0  270.4  199.1   
3  INDIA  1904  14.4   9.6  31.8  33.1  72.4  164.8  261.0  206.4  129.6   
4  INDIA  1905  25.3  20.9  42.7  33.7  55.7   93.3  252.8  200.8  178.4   

     OCT   NOV   DEC  ANNUAL  Jan-Feb  Mar-May  Jun-Sep  Oct-Dec  
0   52.7  38.0   8.3  1032.3     72.4    108.1    752.8     99.0  
1   61.5  27.9  24.4  1030.2     11.7    110.8    794.0    113.8  
2  117.9  36.9  17.7  1190.5     25.3    107.9    884.8    172.5  
3   69.0  11.2  16.3  1019.8     24.0    137.4    761.8     96.6  
4   51.4   9.7  10.5   975.3     46.2    132.2    725.4     71.6  
In [2]:
import plotly.graph_objects as go
import plotly.express as px
import plotly.io as pio
pio.renderers.default = "notebook"

import numpy as np

# Annual rainfall trend analysis: raw yearly series, a 10-year smoothing, and
# the long-term mean, combined on one interactive figure.
yearly_data = rainfall_data.loc[:, ['YEAR', 'ANNUAL']].copy()

# Centered 10-year rolling mean smooths out year-to-year noise.
yearly_data['rolling_avg'] = (
    yearly_data['ANNUAL'].rolling(window=10, center=True).mean()
)

# Long-term mean over the full record, used for the reference line.
mean_rainfall = yearly_data['ANNUAL'].mean()

fig_trends = go.Figure()

# Raw annual series: thin line with small, semi-transparent markers.
fig_trends.add_trace(go.Scatter(
    x=yearly_data['YEAR'],
    y=yearly_data['ANNUAL'],
    mode='lines+markers',
    name='Annual Rainfall',
    line={'color': 'steelblue', 'width': 1.5},
    marker={'size': 3, 'opacity': 0.6},
    opacity=0.8,
))

# Smoothed decadal trend drawn on top of the raw series.
fig_trends.add_trace(go.Scatter(
    x=yearly_data['YEAR'],
    y=yearly_data['rolling_avg'],
    mode='lines',
    name='10-Year Moving Average',
    line={'color': 'darkred', 'width': 3},
))

# Flat reference line at the overall mean.
fig_trends.add_trace(go.Scatter(
    x=yearly_data['YEAR'],
    y=[mean_rainfall] * len(yearly_data),
    mode='lines',
    name=f'Overall Mean ({mean_rainfall:.1f}mm)',
    line={'color': 'orange', 'dash': 'dot', 'width': 2},
))

fig_trends.update_layout(
    title='Annual Rainfall Patterns in India with Trend Analysis (1901-2015)',
    xaxis_title='Year',
    yaxis_title='Rainfall (mm)',
    template='plotly_white',
    hovermode='x unified',
    width=1000,
    height=550,
    showlegend=True,
)

fig_trends.show()

# Monthly climatology: the long-term average rainfall of each calendar month,
# colour-coded by season, with the wettest and driest months in the title.
month_names = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN',
               'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']
monthly_stats = rainfall_data[month_names].mean()

# One bar colour per month, grouped by season
# (winter / pre-monsoon / monsoon / post-monsoon).
season_colors = ['lightblue', 'lightblue',
                 'lightgreen', 'lightgreen', 'lightgreen',
                 'cyan', 'cyan', 'cyan', 'cyan',
                 'orange', 'orange',
                 'lightblue']

# Wettest and driest months for the headline.
peak_month, dry_month = monthly_stats.idxmax(), monthly_stats.idxmin()

fig_monthly_enhanced = go.Figure()

fig_monthly_enhanced.add_trace(go.Bar(
    x=monthly_stats.index,
    y=monthly_stats.values,
    marker_color=season_colors,
    marker_line_color='green',
    marker_line_width=1.5,
    text=[f'{val:.0f}' for val in monthly_stats.values],
    textposition='outside',
    name='Monthly Rainfall',
))

# Dashed reference line at the all-month average.
fig_monthly_enhanced.add_hline(
    y=monthly_stats.mean(),
    line_dash="dash",
    line_color="red",
    line_width=2,
    annotation_text=f"Monthly Average: {monthly_stats.mean():.0f}mm",
    annotation_position="top left",
)

fig_monthly_enhanced.update_layout(
    title=f'Monthly Rainfall Distribution - Peak: {peak_month} ({monthly_stats[peak_month]:.0f}mm), Lowest: {dry_month} ({monthly_stats[dry_month]:.0f}mm)',
    xaxis_title='Month',
    yaxis_title='Average Rainfall (mm)',
    template='plotly_white',
    width=1000,
    height=550,
    showlegend=False,
)

fig_monthly_enhanced.show()

# Seasonal aggregation: mean rainfall per season plus each season's share of
# the seasonal total.
seasons = ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec']
season_labels = ['Winter', 'Pre-Monsoon', 'Monsoon', 'Post-Monsoon']
seasonal_data = rainfall_data[seasons].mean()

# Calculate percentage contribution of each season.
total_seasonal = seasonal_data.sum()
seasonal_percentages = (seasonal_data / total_seasonal * 100).round(1)

# FIX: removed an unused `fig_seasonal_pie = go.Figure()` placeholder that was
# created here but never populated or shown; the bar chart below is the only
# seasonal figure actually rendered.
fig_seasonal_enhanced = go.Figure()

fig_seasonal_enhanced.add_trace(go.Bar(
    x=season_labels,
    y=seasonal_data.values,
    marker_color=['lightsteelblue', 'gold', 'forestgreen', 'darkorange'],
    marker_line_color='black',
    marker_line_width=2,
    # Each bar is labelled with its absolute rainfall and percentage share.
    text=[f'{val:.0f}mm<br>({pct}%)' for val, pct in zip(seasonal_data.values, seasonal_percentages)],
    textposition='outside',
    name='Seasonal Rainfall'
))

fig_seasonal_enhanced.update_layout(
    title=f'Seasonal Rainfall Distribution with Percentage Contribution<br><sub>    - Monsoon dominates with {seasonal_percentages.iloc[2]:.1f}% of annual rainfall</sub>',
    xaxis_title='Season',
    yaxis_title='Average Rainfall (mm)',
    template='plotly_white',
    height=550,
    width=1000,
    showlegend=False,
)

fig_seasonal_enhanced.show()

# Console summary of the headline numbers computed above.
summary_lines = [
    "",
    "📊 RAINFALL ANALYSIS SUMMARY",
    "=" * 50,
    f"Peak Rainfall Month: {peak_month} ({monthly_stats[peak_month]:.0f}mm)",
    f"Driest Month: {dry_month} ({monthly_stats[dry_month]:.0f}mm)",
    f"Annual Average: {mean_rainfall:.0f}mm",
    f"Monsoon Season Contribution: {seasonal_percentages.iloc[2]:.1f}% of total rainfall",
]
print("\n".join(summary_lines))
📊 RAINFALL ANALYSIS SUMMARY
==================================================
Peak Rainfall Month: JUL (291mm)
Driest Month: DEC (15mm)
Annual Average: 1182mm
Monsoon Season Contribution: 75.3% of total rainfall
In [3]:
from scipy.stats import pearsonr, spearmanr
import pandas as pd
import numpy as np
from prettytable import PrettyTable

# Baseline statistics for the annual rainfall series.
annual_mean = rainfall_data['ANNUAL'].mean()
annual_std = rainfall_data['ANNUAL'].std()
annual_median = rainfall_data['ANNUAL'].median()

# Severity thresholds at 1.0 / 1.5 / 2.0 standard deviations below the mean
# (drought side) and above it (excess side).
(moderate_drought_threshold,
 severe_drought_threshold,
 extreme_drought_threshold) = (annual_mean - k * annual_std for k in (1.0, 1.5, 2.0))

(moderate_excess_threshold,
 severe_excess_threshold,
 extreme_excess_threshold) = (annual_mean + k * annual_std for k in (1.0, 1.5, 2.0))

# Categorize years by rainfall severity with enhanced classification
def classify_rainfall_year(annual_rainfall, drought_thresholds=None, excess_thresholds=None):
    """Classify a year's annual rainfall into a severity category.

    Generalized to accept explicit thresholds so the classification logic can
    be reused (and tested) independently of the module-level statistics.

    Parameters
    ----------
    annual_rainfall : float
        Annual rainfall total (mm) for one year.
    drought_thresholds : tuple of 3 floats, optional
        (moderate, severe, extreme) lower cutoffs, in decreasing order.
        Defaults to the module-level ``*_drought_threshold`` values.
    excess_thresholds : tuple of 3 floats, optional
        (moderate, severe, extreme) upper cutoffs, in increasing order.
        Defaults to the module-level ``*_excess_threshold`` values.

    Returns
    -------
    str
        One of 'Extreme Drought', 'Severe Drought', 'Moderate Drought',
        'Normal', 'Moderate Excess', 'Severe Excess', 'Extreme Excess'.
    """
    if drought_thresholds is None:
        drought_thresholds = (moderate_drought_threshold,
                              severe_drought_threshold,
                              extreme_drought_threshold)
    if excess_thresholds is None:
        excess_thresholds = (moderate_excess_threshold,
                             severe_excess_threshold,
                             extreme_excess_threshold)
    mod_dry, sev_dry, ext_dry = drought_thresholds
    mod_wet, sev_wet, ext_wet = excess_thresholds

    # Check the most extreme categories first so overlapping ranges resolve
    # to the strongest label, matching the original elif cascade.
    if annual_rainfall <= ext_dry:
        return 'Extreme Drought'
    if annual_rainfall <= sev_dry:
        return 'Severe Drought'
    if annual_rainfall <= mod_dry:
        return 'Moderate Drought'
    if annual_rainfall >= ext_wet:
        return 'Extreme Excess'
    if annual_rainfall >= sev_wet:
        return 'Severe Excess'
    if annual_rainfall >= mod_wet:
        return 'Moderate Excess'
    return 'Normal'

# Attach the category label plus two anomaly metrics to every year.
rainfall_data_enhanced = rainfall_data.copy()
rainfall_data_enhanced = rainfall_data_enhanced.assign(
    Rainfall_Category=rainfall_data_enhanced['ANNUAL'].apply(classify_rainfall_year),
    Anomaly_Score=(rainfall_data_enhanced['ANNUAL'] - annual_mean) / annual_std,
    Percentile_Rank=rainfall_data_enhanced['ANNUAL'].rank(pct=True) * 100,
)

# Keep only the severe-or-worse years on each tail of the distribution.
severe_dry_mask = rainfall_data_enhanced['ANNUAL'] <= severe_drought_threshold
severe_wet_mask = rainfall_data_enhanced['ANNUAL'] >= severe_excess_threshold
drought_years_enhanced = rainfall_data_enhanced[severe_dry_mask].copy()
excess_years_enhanced = rainfall_data_enhanced[severe_wet_mask].copy()

# Seasonal correlation analysis: how strongly each seasonal total tracks the
# annual total, using both Pearson (linear) and Spearman (rank) correlation.
seasonal_periods = {
    'Winter': 'Jan-Feb',
    'Pre_Monsoon': 'Mar-May',
    'Monsoon': 'Jun-Sep',
    'Post_Monsoon': 'Oct-Dec'
}

correlation_analysis = {}
annual_series = rainfall_data['ANNUAL']
for season_name, season_col in seasonal_periods.items():
    season_series = rainfall_data[season_col]
    r_pearson, p_pearson = pearsonr(season_series, annual_series)
    r_spearman, p_spearman = spearmanr(season_series, annual_series)

    correlation_analysis[season_name] = {
        'Pearson_Correlation': r_pearson,
        'Pearson_P_Value': p_pearson,
        'Spearman_Correlation': r_spearman,
        'Spearman_P_Value': p_spearman,
        # Significance judged on the Pearson p-value at the 5% level.
        'Significance': 'Significant' if p_pearson < 0.05 else 'Not Significant',
    }

# Compact summary frames for the two tails and the correlation results.
extreme_columns = ['YEAR', 'ANNUAL', 'Rainfall_Category', 'Anomaly_Score', 'Percentile_Rank']

# Driest years first.
drought_summary_enhanced = (
    drought_years_enhanced[extreme_columns]
    .sort_values('ANNUAL')
    .reset_index(drop=True)
)

# Wettest years first.
excess_summary_enhanced = (
    excess_years_enhanced[extreme_columns]
    .sort_values('ANNUAL', ascending=False)
    .reset_index(drop=True)
)

correlation_summary_enhanced = pd.DataFrame.from_dict(correlation_analysis, orient='index')

# Headline statistics for the full record.
n_years = len(rainfall_data)
rainfall_statistics = {
    'Total_Years_Analyzed': n_years,
    'Mean_Annual_Rainfall': annual_mean,
    'Standard_Deviation': annual_std,
    'Median_Rainfall': annual_median,
    'Drought_Years_Count': len(drought_years_enhanced),
    'Excess_Years_Count': len(excess_years_enhanced),
    'Drought_Frequency_Percent': (len(drought_years_enhanced) / n_years) * 100,
    'Excess_Frequency_Percent': (len(excess_years_enhanced) / n_years) * 100
}

statistics_summary = pd.DataFrame.from_dict(rainfall_statistics, orient='index', columns=['Value'])

def _build_extreme_years_table(summary_df):
    """Build a PrettyTable of extreme years (one row per year).

    ``summary_df`` must contain YEAR, ANNUAL, Rainfall_Category, Anomaly_Score
    and Percentile_Rank columns, pre-sorted in the desired display order.
    Replaces two previously copy-pasted, byte-identical table builders.
    """
    table = PrettyTable()
    table.field_names = ["Year", "Rainfall (mm)", "Category", "Anomaly Score", "Percentile Rank"]
    table.align["Year"] = "c"
    table.align["Rainfall (mm)"] = "r"
    table.align["Category"] = "l"
    table.align["Anomaly Score"] = "r"
    table.align["Percentile Rank"] = "r"

    for _, row in summary_df.iterrows():
        table.add_row([
            int(row['YEAR']),
            f"{row['ANNUAL']:.1f}",
            row['Rainfall_Category'],
            f"{row['Anomaly_Score']:.2f}",
            f"{row['Percentile_Rank']:.1f}%"
        ])
    return table

print("🌧️  ENHANCED RAINFALL ANALYSIS RESULTS")
print("=" * 60)

# Drought Years Table
print(f"\n📉 DROUGHT YEARS (≤ {severe_drought_threshold:.1f}mm)")
print("-" * 60)
drought_table = _build_extreme_years_table(drought_summary_enhanced)
print(drought_table)

# Excess Years Table
print(f"\n📈 EXCESS RAINFALL YEARS (≥ {severe_excess_threshold:.1f}mm)")
print("-" * 60)
excess_table = _build_extreme_years_table(excess_summary_enhanced)
print(excess_table)

# Seasonal Correlation Table
print(f"\n🔄 SEASONAL CORRELATION ANALYSIS")
print("-" * 60)
correlation_table = PrettyTable()
correlation_table.field_names = ["Season", "Pearson Corr", "Pearson P-Val", "Spearman Corr", "Spearman P-Val", "Significance"]
correlation_table.align["Season"] = "l"
correlation_table.align["Pearson Corr"] = "r"
# BUG FIX: alignment keys must match field_names exactly. The original set
# align for a nonexistent "P-Value" column and omitted "Spearman P-Val",
# leaving both p-value columns center-aligned in the rendered table.
correlation_table.align["Pearson P-Val"] = "r"
correlation_table.align["Spearman Corr"] = "r"
correlation_table.align["Spearman P-Val"] = "r"
correlation_table.align["Significance"] = "c"

for season, data in correlation_analysis.items():
    correlation_table.add_row([
        season,
        f"{data['Pearson_Correlation']:.4f}",
        f"{data['Pearson_P_Value']:.4f}",
        f"{data['Spearman_Correlation']:.4f}",
        f"{data['Spearman_P_Value']:.4f}",
        data['Significance']
    ])

print(correlation_table)

# Overall Statistics Table
print("\n📊 OVERALL STATISTICS SUMMARY")
print("-" * 60)
stats_table = PrettyTable()
stats_table.field_names = ["Statistic", "Value"]
stats_table.align["Statistic"] = "l"
stats_table.align["Value"] = "r"

# Human-readable formatting for each statistic.
formatted_stats = {
    'Total Years Analyzed': f"{rainfall_statistics['Total_Years_Analyzed']}",
    'Mean Annual Rainfall': f"{rainfall_statistics['Mean_Annual_Rainfall']:.1f} mm",
    'Standard Deviation': f"{rainfall_statistics['Standard_Deviation']:.1f} mm",
    'Median Rainfall': f"{rainfall_statistics['Median_Rainfall']:.1f} mm",
    'Drought Years Count': f"{rainfall_statistics['Drought_Years_Count']}",
    'Excess Years Count': f"{rainfall_statistics['Excess_Years_Count']}",
    'Drought Frequency': f"{rainfall_statistics['Drought_Frequency_Percent']:.1f}%",
    'Excess Frequency': f"{rainfall_statistics['Excess_Frequency_Percent']:.1f}%"
}

for item in formatted_stats.items():
    stats_table.add_row(list(item))

print(stats_table)

# Key insights table: headline extremes and the dominant season.
print("\n💡 KEY INSIGHTS")
print("-" * 60)
insights_table = PrettyTable()
insights_table.field_names = ["Insight", "Description"]
insights_table.align["Insight"] = "l"
insights_table.align["Description"] = "l"
insights_table.max_width["Description"] = 50

# Season whose total correlates most strongly (by |r|) with annual rainfall.
most_correlated_season = max(
    correlation_analysis,
    key=lambda season: abs(correlation_analysis[season]['Pearson_Correlation']),
)

driest = drought_summary_enhanced.iloc[0]
wettest = excess_summary_enhanced.iloc[0]
extreme_count = len(drought_years_enhanced) + len(excess_years_enhanced)

insights_table.add_row([
    "Driest Year",
    f"{int(driest['YEAR'])} ({driest['ANNUAL']:.1f}mm)"
])
insights_table.add_row([
    "Wettest Year",
    f"{int(wettest['YEAR'])} ({wettest['ANNUAL']:.1f}mm)"
])
insights_table.add_row([
    "Most Influential Season",
    f"{most_correlated_season} (r={correlation_analysis[most_correlated_season]['Pearson_Correlation']:.3f})"
])
insights_table.add_row([
    "Extreme Event Frequency",
    f"{extreme_count} years ({(extreme_count / len(rainfall_data) * 100):.1f}%)"
])

print(insights_table)
🌧️  ENHANCED RAINFALL ANALYSIS RESULTS
============================================================

📉 DROUGHT YEARS (≤ 1016.0mm)
------------------------------------------------------------
+------+---------------+-----------------+---------------+-----------------+
| Year | Rainfall (mm) | Category        | Anomaly Score | Percentile Rank |
+------+---------------+-----------------+---------------+-----------------+
| 2002 |         920.8 | Extreme Drought |         -2.36 |            0.9% |
| 1965 |         938.4 | Extreme Drought |         -2.20 |            1.7% |
| 1972 |         948.5 | Extreme Drought |         -2.11 |            2.6% |
| 2009 |         959.3 | Extreme Drought |         -2.01 |            3.5% |
| 1905 |         975.3 | Severe Drought  |         -1.87 |            4.3% |
+------+---------------+-----------------+---------------+-----------------+

📈 EXCESS RAINFALL YEARS (≥ 1348.1mm)
------------------------------------------------------------
+------+---------------+----------------+---------------+-----------------+
| Year | Rainfall (mm) | Category       | Anomaly Score | Percentile Rank |
+------+---------------+----------------+---------------+-----------------+
| 1917 |        1480.3 | Extreme Excess |          2.69 |          100.0% |
| 1961 |        1403.0 | Severe Excess  |          2.00 |           99.1% |
| 1990 |        1400.6 | Severe Excess  |          1.97 |           98.3% |
| 1933 |        1393.5 | Severe Excess  |          1.91 |           97.4% |
| 1956 |        1386.2 | Severe Excess  |          1.84 |           96.5% |
| 1959 |        1382.1 | Severe Excess  |          1.81 |           95.7% |
| 1988 |        1351.0 | Severe Excess  |          1.53 |           94.8% |
+------+---------------+----------------+---------------+-----------------+

🔄 SEASONAL CORRELATION ANALYSIS
------------------------------------------------------------
+--------------+--------------+---------------+---------------+----------------+--------------+
| Season       | Pearson Corr | Pearson P-Val | Spearman Corr | Spearman P-Val | Significance |
+--------------+--------------+---------------+---------------+----------------+--------------+
| Winter       |       0.2289 |     0.0139    |        0.2301 |     0.0134     | Significant  |
| Pre_Monsoon  |       0.3131 |     0.0007    |        0.2897 |     0.0017     | Significant  |
| Monsoon      |       0.9300 |     0.0000    |        0.9124 |     0.0000     | Significant  |
| Post_Monsoon |       0.5316 |     0.0000    |        0.4755 |     0.0000     | Significant  |
+--------------+--------------+---------------+---------------+----------------+--------------+

📊 OVERALL STATISTICS SUMMARY
------------------------------------------------------------
+----------------------+-----------+
| Statistic            |     Value |
+----------------------+-----------+
| Total Years Analyzed |       115 |
| Mean Annual Rainfall | 1182.0 mm |
| Standard Deviation   |  110.7 mm |
| Median Rainfall      | 1190.5 mm |
| Drought Years Count  |         5 |
| Excess Years Count   |         7 |
| Drought Frequency    |      4.3% |
| Excess Frequency     |      6.1% |
+----------------------+-----------+

💡 KEY INSIGHTS
------------------------------------------------------------
+-------------------------+-------------------+
| Insight                 | Description       |
+-------------------------+-------------------+
| Driest Year             | 2002 (920.8mm)    |
| Wettest Year            | 1917 (1480.3mm)   |
| Most Influential Season | Monsoon (r=0.930) |
| Extreme Event Frequency | 12 years (10.4%)  |
+-------------------------+-------------------+
In [4]:
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
import numpy as np
from plotly.subplots import make_subplots

# Per-month anomaly detection: z-score each month against its own long-term
# climatology, and flag years beyond the threshold.
monthly_columns = ['JAN', 'FEB', 'MAR', 'APR', 'MAY', 'JUN', 'JUL', 'AUG', 'SEP', 'OCT', 'NOV', 'DEC']

# Long-term (1901-2015) monthly means and standard deviations.
monthly_means = rainfall_data[monthly_columns].mean()
monthly_stds = rainfall_data[monthly_columns].std()

# Flag anything at least 1.5 standard deviations from the monthly mean.
anomaly_threshold = 1.5

anomaly_data = rainfall_data.copy()

# Add '<MONTH>_anomaly' (z-score) and '<MONTH>_is_anomaly' (bool) columns.
for month in monthly_columns:
    z_scores = (anomaly_data[month] - monthly_means[month]) / monthly_stds[month]
    anomaly_data[f'{month}_anomaly'] = z_scores
    anomaly_data[f'{month}_is_anomaly'] = z_scores.abs() >= anomaly_threshold

# Create comprehensive monthly anomaly visualization: a 4x3 grid with one
# panel per calendar month; shared y-axes make magnitudes comparable.
fig_monthly_anomalies = make_subplots(
    rows=4, cols=3,
    subplot_titles=monthly_columns,
    shared_yaxes=True,
    vertical_spacing=0.08,
    horizontal_spacing=0.06
)

# Color scheme for anomalies
normal_color = 'lightblue'
anomaly_color = 'red'

for i, month in enumerate(monthly_columns):
    # Map the flat month index onto the 4x3 subplot grid (1-based row/col).
    row = (i // 3) + 1
    col = (i % 3) + 1

    # Get colors for each year based on anomaly status
    colors = [anomaly_color if is_anomaly else normal_color 
              for is_anomaly in anomaly_data[f'{month}_is_anomaly']]

    # One rainfall-vs-year scatter per month; anomalous years drawn in red.
    fig_monthly_anomalies.add_trace(
        go.Scatter(
            x=anomaly_data['YEAR'],
            y=anomaly_data[month],
            mode='markers',
            marker=dict(
                color=colors,
                size=6,
                line=dict(width=1, color='darkblue')
            ),
            name=month,
            showlegend=False,
            hovertemplate=f'<b>{month}</b><br>' +
                         'Year: %{x}<br>' +
                         'Rainfall: %{y:.1f}mm<br>' +
                         '<extra></extra>'
        ),
        row=row, col=col
    )

    # Add mean line (long-term monthly mean, dashed green) to this panel.
    fig_monthly_anomalies.add_hline(
        y=monthly_means[month],
        line_dash="dash",
        line_color="green",
        line_width=1,
        row=row, col=col
    )

    # Add anomaly threshold lines at mean ± 1.5σ (dotted orange).
    upper_threshold = monthly_means[month] + anomaly_threshold * monthly_stds[month]
    lower_threshold = monthly_means[month] - anomaly_threshold * monthly_stds[month]

    fig_monthly_anomalies.add_hline(
        y=upper_threshold,
        line_dash="dot",
        line_color="orange",
        line_width=1,
        row=row, col=col
    )

    fig_monthly_anomalies.add_hline(
        y=lower_threshold,
        line_dash="dot",
        line_color="orange",
        line_width=1,
        row=row, col=col
    )

fig_monthly_anomalies.update_layout(
    title='Monthly Rainfall Anomalies Across All Years (1901-2015)<br><sub>Red dots indicate anomalies (>1.5σ from mean)</sub>',
    height=1800,
    width=1500,
    template='plotly_white'
)

# Axis titles are applied to every subplot in the grid.
fig_monthly_anomalies.update_xaxes(title_text="Year")
fig_monthly_anomalies.update_yaxes(title_text="Rainfall (mm)")

fig_monthly_anomalies.show()

# Option 4: Box Plot of Anomaly Magnitudes by Month
fig_boxplot = go.Figure()

for month in monthly_columns:
    # z-scores of only the flagged (anomalous) years for this month.
    month_anomalies = anomaly_data.loc[anomaly_data[f'{month}_is_anomaly'], f'{month}_anomaly']

    if not month_anomalies.empty:
        fig_boxplot.add_trace(go.Box(
            y=month_anomalies,
            name=month,
            boxpoints='all',
            pointpos=0,
            marker=dict(color=anomaly_color, size=4),
            line=dict(color='darkred'),
            fillcolor='rgba(255,0,0,0.3)'
        ))

fig_boxplot.update_layout(
    title='Distribution of Anomaly Magnitudes by Month<br><sub>Statistical spread of extreme events - reveals which months show most variable anomaly patterns</sub>',
    xaxis_title='Month',
    yaxis_title='Anomaly Score (σ)',
    height=600,
    width=1200,
    template='plotly_white'
)

fig_boxplot.show()

# Per-month anomaly counts plus the single most extreme event of each month.
anomaly_counts = {}
extreme_years = {}

for month in monthly_columns:
    flagged = anomaly_data[anomaly_data[f'{month}_is_anomaly']]
    anomaly_counts[month] = len(flagged)

    if len(flagged) == 0:
        continue

    # Largest |z-score| wins, regardless of sign (wet or dry extreme).
    worst_idx = flagged[f'{month}_anomaly'].abs().idxmax()
    worst = flagged.loc[worst_idx]
    extreme_years[month] = {
        'year': int(worst['YEAR']),
        'rainfall': worst[month],
        'anomaly_score': worst[f'{month}_anomaly']
    }

# Anomaly frequency bar chart
fig_anomaly_freq = go.Figure()

counts_in_order = [anomaly_counts[month] for month in monthly_columns]
fig_anomaly_freq.add_trace(go.Bar(
    x=monthly_columns,
    y=counts_in_order,
    marker_color=['red' if count > 0 else 'lightgray' for count in counts_in_order],
    marker_line_color='darkred',
    marker_line_width=1.5,
    text=counts_in_order,
    textposition='outside'
))

fig_anomaly_freq.update_layout(
    title='Frequency of Monthly Rainfall Anomalies (1901-2015)<br><sub>Number of years each month experienced anomalous rainfall</sub>',
    xaxis_title='Month',
    yaxis_title='Number of Anomalous Years',
    template='plotly_white',
    height=500,
    width=1000,
    showlegend=False
)

fig_anomaly_freq.show()

# Headline anomaly statistics.
total_anomalies = sum(anomaly_counts.values())
most_anomalous_month = max(anomaly_counts.keys(), key=lambda x: anomaly_counts[x])
least_anomalous_month = min(anomaly_counts.keys(), key=lambda x: anomaly_counts[x])

print(f"\n🔍 MONTHLY RAINFALL ANOMALY ANALYSIS")
print("=" * 60)
print(f"📊 Total Anomalous Months: {total_anomalies}")
print(f"📈 Most Anomalous Month: {most_anomalous_month} ({anomaly_counts[most_anomalous_month]} years)")
print(f"📉 Least Anomalous Month: {least_anomalous_month} ({anomaly_counts[least_anomalous_month]} years)")
print(f"⚡ Anomaly Threshold: ±{anomaly_threshold} standard deviations")

print(f"\n🎯 MOST EXTREME EVENTS BY MONTH")
print("-" * 60)
for month in monthly_columns:
    if month in extreme_years:
        extreme = extreme_years[month]
        print(f"{month}: {extreme['year']} ({extreme['rainfall']:.1f}mm, σ={extreme['anomaly_score']:.2f})")
    else:
        print(f"{month}: No anomalies detected")

# Calculate decade-wise anomaly trends.
# FIX: replaced groupby().apply(lambda ...) over the whole frame — which is
# slow and triggers pandas deprecation warnings about applying to grouped
# frames — with a vectorized sum of the boolean anomaly-flag columns.
anomaly_data['DECADE'] = (anomaly_data['YEAR'] // 10) * 10
is_anomaly_cols = [f'{month}_is_anomaly' for month in monthly_columns]
decade_anomalies = (
    anomaly_data.groupby('DECADE')[is_anomaly_cols]
    .sum()            # anomalous years per month, per decade
    .sum(axis=1)      # total anomalous month-events per decade
    .reset_index()
)
decade_anomalies.columns = ['DECADE', 'ANOMALY_COUNT']

print(f"\n📅 ANOMALIES BY DECADE")
print("-" * 60)
for _, row in decade_anomalies.iterrows():
    decade_label = f"{int(row['DECADE'])}s"
    print(f"{decade_label}: {int(row['ANOMALY_COUNT'])} anomalous months")
🔍 MONTHLY RAINFALL ANOMALY ANALYSIS
============================================================
📊 Total Anomalous Months: 158
📈 Most Anomalous Month: FEB (17 years)
📉 Least Anomalous Month: DEC (9 years)
⚡ Anomaly Threshold: ±1.5 standard deviations

🎯 MOST EXTREME EVENTS BY MONTH
------------------------------------------------------------
JAN: 1943 (58.5mm, σ=3.88)
FEB: 1937 (53.8mm, σ=2.64)
MAR: 1967 (63.3mm, σ=2.85)
APR: 2015 (69.4mm, σ=3.01)
MAY: 1990 (114.5mm, σ=3.34)
JUN: 1938 (275.5mm, σ=3.01)
JUL: 2002 (138.9mm, σ=-3.70)
AUG: 1926 (335.5mm, σ=2.20)
SEP: 1917 (281.0mm, σ=2.96)
OCT: 1917 (158.8mm, σ=2.94)
NOV: 1979 (74.2mm, σ=2.79)
DEC: 1967 (54.4mm, σ=4.49)

📅 ANOMALIES BY DECADE
------------------------------------------------------------
1900s: 21 anomalous months
1910s: 20 anomalous months
1920s: 17 anomalous months
1930s: 13 anomalous months
1940s: 13 anomalous months
1950s: 17 anomalous months
1960s: 9 anomalous months
1970s: 13 anomalous months
1980s: 7 anomalous months
1990s: 10 anomalous months
2000s: 10 anomalous months
2010s: 8 anomalous months
In [5]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import plotly.express as px
import plotly.graph_objects as go
import numpy as np

# Data Preparation with Enhanced Validation
# Features: the four seasonal aggregates plus the annual total (all in mm).
seasonal_features = ['Jan-Feb', 'Mar-May', 'Jun-Sep', 'Oct-Dec', 'ANNUAL']
rainfall_features = rainfall_data[seasonal_features]

# Advanced Standardization with Variance Check
# Standardize so each feature contributes comparably to k-means distances;
# the assert guards against NaN/inf leaking into the clustering below.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(rainfall_features)
assert np.isfinite(scaled_features).all(), "Missing/infinite values detected after scaling"

# Optimal Cluster Determination using Elbow Method [1][15][17]
inertias = []
silhouettes = []  # NOTE(review): collected but never read afterwards
k_range = range(2, 8)

# One k-means fit per candidate k; inertia drives the elbow heuristic below.
for k in k_range:
    kmeans = KMeans(n_clusters=k, n_init=10, random_state=42)
    cluster_labels = kmeans.fit_predict(scaled_features)
    inertias.append(kmeans.inertia_)
    silhouettes.append(silhouette_score(scaled_features, cluster_labels))

# Automated Elbow Detection [19]
# Picks the k where the inertia curve bends most sharply. NOTE(review):
# np.diff applied twice shortens the array by 2, so argmax index i maps back
# to a k only under an offset convention; the "+ 2" here assumes the elbow
# sits at k_range[i] = i + 2 — verify this is the intended mapping.
elbow_k = np.argmax(np.diff(np.diff(inertias))**2) + 2  # 2nd derivative maximization
optimal_k = max(3, min(elbow_k, 6))  # Constrain between 3-6 clusters

# Final Clustering with Optimal K [12]
final_kmeans = KMeans(n_clusters=optimal_k, n_init=20, random_state=42)
rainfall_data['Rainfall_Cluster'] = final_kmeans.fit_predict(scaled_features)

# Cluster Labeling Based on Centroid Analysis [9][10]
# BUG FIX: the original computed `sorted_indices` and then ignored it, mapping
# labels by raw cluster index — so 'Severe Dry'..'Extreme Wet' bore no relation
# to actual rainfall levels. Rank clusters by their annual-rainfall centroid
# (last feature column) and assign severity names driest-to-wettest. Also
# provide 6 names since optimal_k is constrained to at most 6 (the original's
# 5-name list would raise IndexError for k=6).
centroids = scaler.inverse_transform(final_kmeans.cluster_centers_)
sorted_indices = np.argsort(centroids[:, -1])  # cluster ids, driest -> wettest
severity_names = ['Severe Dry', 'Dry', 'Normal', 'Wet', 'Extreme Wet', 'Exceptional Wet']
cluster_labels = {int(cluster_id): severity_names[rank]
                  for rank, cluster_id in enumerate(sorted_indices)}
rainfall_data['Rainfall_Category'] = rainfall_data['Rainfall_Cluster'].map(cluster_labels)

# Enhanced Visualization with Cluster Trends [5][14]
fig = px.scatter(
    rainfall_data,
    x='YEAR',
    y='ANNUAL',
    color='Rainfall_Category',
    title=f'Rainfall Pattern Clustering (Optimal K={optimal_k})',
    labels={'YEAR': 'Year', 'ANNUAL': 'Annual Rainfall (mm)'},
    hover_data=seasonal_features,
)

# Add Cluster Centroids [9]
# BUG FIX: the original grouped by Rainfall_Category — whose groupby order is
# alphabetical — and zipped against `centroids`, which is indexed by cluster
# id, pairing each centroid with the wrong median year. Group by the cluster
# id instead so row i of `centroids` aligns with cluster i's median year.
centroid_years = rainfall_data.groupby('Rainfall_Cluster')['YEAR'].median()
for i, centroid in enumerate(centroids):
    fig.add_trace(go.Scatter(
        x=[centroid_years.loc[i]],
        y=[centroid[-1]],  # annual-rainfall component of the centroid
        mode='markers',
        marker=dict(size=12, line=dict(width=2)),
        name=f'Cluster {i+1} Center'
    ))


fig.update_layout(
    template='plotly_white',
    height=700,
    legend_title_text='Rainfall Category',
    xaxis_rangeslider_visible=True
)

fig.show()
In [6]:
import pandas as pd
import numpy as np
# use: pip install prophet
from prophet import Prophet
from prophet.plot import plot_plotly, plot_components_plotly

# Prepare the data for Prophet: stamp each annual total at year-end so one
# observation exists per year.
rainfall_data['DATE'] = pd.to_datetime(rainfall_data['YEAR'].astype(str) + '-12-31')
annual_rainfall_ts = rainfall_data.set_index('DATE')['ANNUAL']

# Prophet requires exactly two columns named 'ds' (datestamp) and 'y' (value).
prophet_data = annual_rainfall_ts.reset_index()
prophet_data.columns = ['ds', 'y']

# Initialize and fit Prophet model
# NOTE(review): with only one observation per year there is no intra-year
# signal for yearly seasonality to fit — confirm yearly_seasonality=True is
# intentional rather than fitting noise.
prophet_model = Prophet(
    yearly_seasonality=True,
    changepoint_prior_scale=0.1,  # trend flexibility (larger = more changepoints)
    interval_width=0.8  # 80% uncertainty intervals
)
prophet_model.fit(prophet_data)

# Create future dataframe for the next 20 years ('YE' = year-end frequency,
# matching the Dec-31 stamps used above).
future = prophet_model.make_future_dataframe(periods=20, freq='YE')
forecast = prophet_model.predict(future)

# Create forecast plot
fig_forecast = plot_plotly(prophet_model, forecast)
fig_forecast.update_layout(
    # FIX: the subtitle was fused onto the title with no line break and a
    # stray leading colon; use the '<br><sub>...</sub>' pattern every other
    # figure title in this notebook follows.
    title='Annual Rainfall Forecast Using Prophet (1901-2035)<br><sub>20-year projection with confidence intervals based on historical patterns</sub>',
    xaxis_title='Year',
    yaxis_title='Annual Rainfall (mm)',
    height=900,
    width=1500
)
fig_forecast.show()

# Show model components
fig_components = plot_components_plotly(prophet_model, forecast)
fig_components.update_layout(
    title='Prophet Model Components<br><sub>Trend and seasonal patterns in rainfall data</sub>',
    template='plotly_white',
    height=500
)
fig_components.show()

# Compare the historical record with the forecast-only horizon (dates beyond
# the last observed year).
forecast_period = forecast.loc[forecast['ds'] > prophet_data['ds'].max()]
historical_mean = prophet_data['y'].mean()
forecast_mean = forecast_period['yhat'].mean()

pct_change = (forecast_mean / historical_mean - 1) * 100
print(f"Historical Average (1901-2015): {historical_mean:.1f} mm")
print(f"Forecast Average (2016-2035): {forecast_mean:.1f} mm")
print(f"Projected Change: {forecast_mean - historical_mean:+.1f} mm ({pct_change:+.1f}%)")
10:39:36 - cmdstanpy - INFO - Chain [1] start processing
10:39:36 - cmdstanpy - INFO - Chain [1] done processing
Historical Average (1901-2015): 1182.0 mm
Forecast Average (2016-2035): 1104.7 mm
Projected Change: -77.3 mm (-6.5%)